#load packages
library(tidyverse)
library(dplyr)
library(ggplot2)
library(forcats)
library(Hmisc)
library(HH)
library(mi)
library(extracat)
library(tm)
library(rapportools)

library(vcd)
library(plotly)
library(shiny)
# set color
mycolor <- "#80593D"
myfill <- "#9FC29F"
# load data from moma and met
moma_artists <- read_csv("../data/raw/moma/Artists.csv")
moma_artworks <- read_csv("../data/raw/moma/Artworks.csv")
# 1. Missing data
# 1.1 MOMA
# cut out columns that we dont need

str(moma_artists)
## Classes 'tbl_df', 'tbl' and 'data.frame':    15853 obs. of  9 variables:
##  $ ConstituentID: int  1 2 3 4 5 6 7 9 10 11 ...
##  $ DisplayName  : chr  "Robert Arneson" "Doroteo Arnaiz" "Bill Arnold" "Charles Arnoldi" ...
##  $ ArtistBio    : chr  "American, 1930–1992" "Spanish, born 1936" "American, born 1941" "American, born 1946" ...
##  $ Nationality  : chr  "American" "Spanish" "American" "American" ...
##  $ Gender       : chr  "Male" "Male" "Male" "Male" ...
##  $ BeginDate    : int  1930 1936 1941 1946 1941 1925 1941 1923 1918 1886 ...
##  $ EndDate      : int  1992 0 0 0 0 0 0 0 0 1966 ...
##  $ Wiki QID     : chr  NA NA NA "Q1063584" ...
##  $ ULAN         : int  NA NA NA 500027998 NA NA NA 500003363 500042413 500031000 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 9
##   .. ..$ ConstituentID: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ DisplayName  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ ArtistBio    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Nationality  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Gender       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ BeginDate    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ EndDate      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Wiki QID     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ ULAN         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

This is a dataset describing 15,853 observations with 9 columns. However, we can quickly notice that some columns are either redundant or seemingly meaningless for exploratory analysis, such as ConstituentID, ArtistBio (which is combining Nationality, BeginDate and EndDate), Wiki QID, ULAN; as a result, we will just go ahead and drop them.

# we first explore the artists dataset, we only include: DisplayName, Nationality, Gender, BeginDate, and EndDate
moma_artists <- moma_artists[c(2,4,5,6,7)]

str(moma_artists)
## Classes 'tbl_df', 'tbl' and 'data.frame':    15853 obs. of  5 variables:
##  $ DisplayName: chr  "Robert Arneson" "Doroteo Arnaiz" "Bill Arnold" "Charles Arnoldi" ...
##  $ Nationality: chr  "American" "Spanish" "American" "American" ...
##  $ Gender     : chr  "Male" "Male" "Male" "Male" ...
##  $ BeginDate  : int  1930 1936 1941 1946 1941 1925 1941 1923 1918 1886 ...
##  $ EndDate    : int  1992 0 0 0 0 0 0 0 0 1966 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 9
##   .. ..$ ConstituentID: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ DisplayName  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ ArtistBio    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Nationality  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Gender       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ BeginDate    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ EndDate      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Wiki QID     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ ULAN         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

Now, we just have 5 columns with the same number of observations: Artist Display Name, Nationality, Gender, Artist Birth Year, Artist Death Year. We will proceed to clean up the datasets and observed the patterns in the missing data.

# quick scan
# We will do a quick scan first:
colSums(is.na(moma_artists))
## DisplayName Nationality      Gender   BeginDate     EndDate 
##           0        2556        3179           0           0

From the report, it says that only Nationality and Gender are the columns with missing value. This seems odd as it is nearly impossible for BeginDate (Birth Year) and EndDate (Death Year) to not have any missing data. Thus, we go back and eyeball the dataset. From the process, we realize that the missing values in BeginDate and EndDate columns are denoted as 0 instead NA. That said, we will replace the 0s with NAs.

# clean up BeginDate and EndDate columns
moma_clean <- moma_artists
moma_clean$BeginDate[(moma_clean$BeginDate == 0)] <- NA
moma_clean$EndDate[(moma_clean$EndDate == 0)] <- NA

# now check again
colSums(is.na(moma_clean))
## DisplayName Nationality      Gender   BeginDate     EndDate 
##           0        2556        3179        3791       10779

After the clean up, now the NAs are revealed. There are 3,791 missing data in the column BeginDate and 10,799 missing in the EndDate column. This quite literally suggests that with the artists being featured in MOMA. More than half of them (~68%) are missing death years, and less than one-fourth (~24%) of them are missing birth years. Perhaps, there is difficulty with MOMA to collect full bio of the artists being featured in the museum, especially with the death years of the artists.

# group the other three columns and see if there is any NAs that are not tracked by NA due to human error while entering the data

nationality_na <- moma_clean %>% group_by(Nationality) %>% dplyr::summarise(Total = dplyr::n()) %>% arrange(Total)

gender_na <- moma_clean %>% group_by (Gender) %>% dplyr::summarise(Total = dplyr::n()) %>% arrange(Total)

name_na <- moma_clean %>% group_by(DisplayName) %>% dplyr::summarise(Total = dplyr::n()) %>% arrange(Total)

# by manually search "unknown" and related keywords in the dataset, we caught on error: NAs in Nationality are also denoted as "nationality unknown"
moma_clean$Nationality[moma_clean$Nationality == "Nationality unknown"] <- NA

# Similarly, with the Name column, there are also some bad format NAs
moma_clean$DisplayName <- tolower(moma_clean$DisplayName)
moma_clean$DisplayName[str_detect(moma_clean$DisplayName, "unknown") == TRUE] <- NA

# with the gender column, we noticed that the biggest problem is with lowercase and upper case; also, since there is only 1 data observation on binary gender, we might consider dropping it later
moma_clean$Gender <- tolower(moma_clean$Gender)

colSums(is.na(moma_clean))
## DisplayName Nationality      Gender   BeginDate     EndDate 
##          53        2736        3179        3791       10779

We also checked the other three columns, Display Name, Nationality, and Gender, in case there are some human errors (such as entering the data in the wrong format) that might lead to NAs not being tracked by correctly. In fact, there were some for the Display Name and Nationality columns. Nas in the Display column are denoted with characters spelling out in words. Similarly, NAs in the Nationality column are also denoted in words. After cleaning them up, we uncovered 53 NAs in the Display Name column, 180 NAs in the Nationality column and above is the result of complete missing data values in this dataset. For the Gender column, no NAs are uncovered, but there is upper case and lower case issue such as “male” and “Male”, so we just turn all the gender types to lower case.

The result above is the total sum of all the missing data in this dataset. We will plot a graph to better understand the missing patterns.

# plot missing patterns
visna(moma_clean, sort = 'r')
## Warning in melt(as.data.frame(xs), ncol(xs)): The melt generic in data.table
## has been passed a data.frame and will attempt to redirect to the relevant
## reshape2 method; please note that reshape2 is deprecated, and this redirection
## is now deprecated as well. To continue using melt methods from reshape2 while
## both libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(as.data.frame(xs)). In the next version, this warning will become
## an error.

Concluded from the graph above, most rows are missing just the EndDate column, which means missing the death year of an artist. Not missing any data is the second most common pattern in this dataset. But surprisingly, missing all of the columns (other than Display Name) is the third most common patterns. Thus, there are quite decent amount of rows that we cannot use (since they are missing four of the columns that we want to look into). We will now proceed to explore the dataset while taken into account of the missing patterns.

## 1.2. missing data from moma artworks dataset
str(moma_artworks)
## Classes 'tbl_df', 'tbl' and 'data.frame':    138124 obs. of  29 variables:
##  $ Title             : chr  "Ferdinandsbrücke Project, Vienna, Austria (Elevation, preliminary version)" "City of Music, National Superior Conservatory of Music and Dance, Paris, France, View from interior courtyard" "Villa near Vienna Project, Outside Vienna, Austria, Elevation" "The Manhattan Transcripts Project, New York, New York, Introductory panel to Episode 1: The Park" ...
##  $ Artist            : chr  "Otto Wagner" "Christian de Portzamparc" "Emil Hoppe" "Bernard Tschumi" ...
##  $ ConstituentID     : chr  "6210" "7470" "7605" "7056" ...
##  $ ArtistBio         : chr  "(Austrian, 1841–1918)" "(French, born 1944)" "(Austrian, 1876–1957)" "(French and Swiss, born Switzerland 1944)" ...
##  $ Nationality       : chr  "(Austrian)" "(French)" "(Austrian)" "()" ...
##  $ BeginDate         : chr  "(1841)" "(1944)" "(1876)" "(1944)" ...
##  $ EndDate           : chr  "(1918)" "(0)" "(1957)" "(0)" ...
##  $ Gender            : chr  "(Male)" "(Male)" "(Male)" "(Male)" ...
##  $ Date              : chr  "1896" "1987" "1903" "1980" ...
##  $ Medium            : chr  "Ink and cut-and-pasted painted pages on paper" "Paint and colored pencil on print" "Graphite, pen, color pencil, ink, and gouache on tracing paper" "Photographic reproduction with colored synthetic laminate" ...
##  $ Dimensions        : chr  "19 1/8 x 66 1/2\" (48.6 x 168.9 cm)" "16 x 11 3/4\" (40.6 x 29.8 cm)" "13 1/2 x 12 1/2\" (34.3 x 31.8 cm)" "20 x 20\" (50.8 x 50.8 cm)" ...
##  $ CreditLine        : chr  "Fractional and promised gift of Jo Carole and Ronald S. Lauder" "Gift of the architect in honor of Lily Auchincloss" "Gift of Jo Carole and Ronald S. Lauder" "Purchase and partial gift of the architect in honor of Lily Auchincloss" ...
##  $ AccessionNumber   : chr  "885.1996" "1.1995" "1.1997" "2.1995" ...
##  $ Classification    : chr  "Architecture" "Architecture" "Architecture" "Architecture" ...
##  $ Department        : chr  "Architecture & Design" "Architecture & Design" "Architecture & Design" "Architecture & Design" ...
##  $ DateAcquired      : Date, format: "1996-04-09" "1995-01-17" ...
##  $ Cataloged         : chr  "Y" "Y" "Y" "Y" ...
##  $ ObjectID          : int  2 3 4 5 6 7 8 9 10 11 ...
##  $ URL               : chr  "http://www.moma.org/collection/works/2" "http://www.moma.org/collection/works/3" "http://www.moma.org/collection/works/4" "http://www.moma.org/collection/works/5" ...
##  $ ThumbnailURL      : chr  "http://www.moma.org/media/W1siZiIsIjU5NDA1Il0sWyJwIiwiY29udmVydCIsIi1yZXNpemUgMzAweDMwMFx1MDAzZSJdXQ.jpg?sha=137b8455b1ec6167" "http://www.moma.org/media/W1siZiIsIjk3Il0sWyJwIiwiY29udmVydCIsIi1yZXNpemUgMzAweDMwMFx1MDAzZSJdXQ.jpg?sha=55b65fa4368fe00a" "http://www.moma.org/media/W1siZiIsIjk4Il0sWyJwIiwiY29udmVydCIsIi1yZXNpemUgMzAweDMwMFx1MDAzZSJdXQ.jpg?sha=fdcfca4db3acac1f" "http://www.moma.org/media/W1siZiIsIjEyNCJdLFsicCIsImNvbnZlcnQiLCItcmVzaXplIDMwMHgzMDBcdTAwM2UiXV0.jpg?sha=c89b9071486760a5" ...
##  $ Circumference (cm): chr  NA NA NA NA ...
##  $ Depth (cm)        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Diameter (cm)     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Height (cm)       : num  48.6 40.6 34.3 50.8 38.4 ...
##  $ Length (cm)       : chr  NA NA NA NA ...
##  $ Weight (kg)       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Width (cm)        : num  168.9 29.8 31.8 50.8 19.1 ...
##  $ Seat Height (cm)  : chr  NA NA NA NA ...
##  $ Duration (sec.)   : chr  NA NA NA NA ...
##  - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 2 obs. of  5 variables:
##   ..$ row     : int  128291 134545
##   ..$ col     : chr  "DateAcquired" "DateAcquired"
##   ..$ expected: chr  "date like " "date like "
##   ..$ actual  : chr  "1977-08" "1961"
##   ..$ file    : chr  "'../data/raw/moma/Artworks.csv'" "'../data/raw/moma/Artworks.csv'"
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 29
##   .. ..$ Title             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Artist            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ ConstituentID     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ ArtistBio         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Nationality       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ BeginDate         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ EndDate           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Gender            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Date              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Medium            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Dimensions        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ CreditLine        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ AccessionNumber   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Classification    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Department        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ DateAcquired      :List of 1
##   .. .. ..$ format: chr ""
##   .. .. ..- attr(*, "class")= chr  "collector_date" "collector"
##   .. ..$ Cataloged         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ ObjectID          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ URL               : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ ThumbnailURL      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Circumference (cm): list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Depth (cm)        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ Diameter (cm)     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ Height (cm)       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ Length (cm)       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Weight (kg)       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ Width (cm)        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ Seat Height (cm)  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Duration (sec.)   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

We have 138124 observations with 29 variables. But again, some columns are redundant or not significantly meaningful in our case, so we will drop those.

# we chose title, artist, beginDate, endDate, Gender, Date Made, Medium, Classification, Acquisition Year
artworks_clean <- moma_artworks[c(1,2,5,6,7,8,9,10,14,16)]

# check missing
colSums(is.na(artworks_clean))
##          Title         Artist    Nationality      BeginDate        EndDate 
##             39           1455           1455           1455           1455 
##         Gender           Date         Medium Classification   DateAcquired 
##           1455           2370          10963              0           6741

So we almost have missings from all columns other than Classification column.

# fix NA format

# clean data that are bracked by ()
artworks_clean$Nationality <- removePunctuation(artworks_clean$Nationality)
artworks_clean$BeginDate <- removePunctuation(artworks_clean$BeginDate)
artworks_clean$EndDate <- removePunctuation(artworks_clean$EndDate)
artworks_clean$Gender <- removePunctuation(artworks_clean$Gender)

# add columns for acquisition year
artworks_clean <- artworks_clean %>% mutate(YearAcquired = substr(DateAcquired,1,4))

# for clarity, we remove the original column
artworks_clean <- dplyr::select(artworks_clean,-DateAcquired)

# Nationality and Gender columns have many empty rows so we use *is.empty* from *rapportools*
artworks_clean$Nationality[is.empty(artworks_clean$Nationality) == TRUE] <- NA
artworks_clean$Gender[is.empty(artworks_clean$Gender) == TRUE] <- NA
artworks_clean$Gender <- tolower(artworks_clean$Gender)

# BeginDate and EndDate have many 0 rows to indicate NAs
artworks_clean$BeginDate[artworks_clean$BeginDate == 0] <- NA
artworks_clean$EndDate[artworks_clean$EndDate == 0] <- NA

# clean "unknown" in Date column
artworks_clean$Date[artworks_clean$Date == "unknown"] <- NA

# we use the same mechanism that detect the most number of NAs (increase false negative, but make sure we increase true positive)
artworks_clean$Title[str_detect(artworks_clean$Title, "unknown") == TRUE] <- NA
artworks_clean$Artist[str_detect(artworks_clean$Artist, "unknown") == TRUE] <- NA
artworks_clean$Medium[str_detect(artworks_clean$Medium, "unknown") == TRUE] <- NA
artworks_clean$Title[str_detect(artworks_clean$Title, "Unknown") == TRUE] <- NA
artworks_clean$Artist[str_detect(artworks_clean$Artist, "Unknown") == TRUE] <- NA
artworks_clean$Medium[str_detect(artworks_clean$Medium, "Unknown") == TRUE] <- NA
artworks_clean$Nationality[str_detect(artworks_clean$Nationality, "Unknown") == TRUE] <- NA
artworks_clean$Nationality[str_detect(artworks_clean$Nationality, "unknown") == TRUE] <- NA
artworks_clean$Date[artworks_clean$Date == "n.d."] <- NA

colSums(is.na(artworks_clean))
##          Title         Artist    Nationality      BeginDate        EndDate 
##             85           5255           7448           9150          48431 
##         Gender           Date         Medium Classification   YearAcquired 
##           8518           3144          11005              0           6741
visna(artworks_clean, sort = "b")
## Warning in melt(as.data.frame(xs), ncol(xs)): The melt generic in data.table
## has been passed a data.frame and will attempt to redirect to the relevant
## reshape2 method; please note that reshape2 is deprecated, and this redirection
## is now deprecated as well. To continue using melt methods from reshape2 while
## both libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(as.data.frame(xs)). In the next version, this warning will become
## an error.

# just to be safe, we group them again.
date_test <- artworks_clean %>% group_by(Date) %>% dplyr::summarise(Total = dplyr::n()) %>% arrange(Total)
aqr_test <- artworks_clean %>% group_by(YearAcquired) %>% dplyr::summarise(Total = dplyr::n()) %>% arrange(Total)

From the graph, EndDate is the column with most missing data, followed by Medium, BeginDate and Gender. To our surprise, Year Acquisition and Date Made have much fewer missing rows than other columns. This can point to the fact that when MOMA collected art pieces, they cared more about when the object was made, rather than the biography of the artist himself or herself (such as birth year, death year, gender, or even medium!)

# First, what is MOMA's preference on Nationality?
moma_org <- moma_clean[2] %>% group_by(Nationality) %>% dplyr::summarise(Frequency = dplyr::n()) %>% arrange(Frequency) 

# check the stats
describe(moma_org)
## moma_org 
## 
##  2  Variables      127  Observations
## --------------------------------------------------------------------------------
## Nationality 
##        n  missing distinct 
##      126        1      126 
## 
## lowest : Afghan     Albanian   Algerian   American   Argentine 
## highest: Venezuelan Vietnamese Welsh      Yugoslav   Zimbabwean
## --------------------------------------------------------------------------------
## Frequency 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      127        0       53    0.986    124.8    225.3      1.0      1.0 
##      .25      .50      .75      .90      .95 
##      2.0      6.0     44.0    165.0    446.2 
## 
## lowest :    1    2    3    4    5, highest:  872  880  977 2736 5472
## --------------------------------------------------------------------------------

Of the 15,853 observations (including NAs), 95% of the data are less than the frequency of the top 5% of the nationalities, where the cut off is only 446 counts. Thus, we are definitely seeing a long-tail effect, that of the 15,853 observations, most of them are from the same countries on the top 5% list.

# take the top 5% that can represent the nationality column
moma_org5 <- moma_org %>% group_by (Frequency) %>% filter(Frequency > 446)

# filter out na for now
moma_org5 <- moma_org5 %>% group_by(Nationality) %>% filter(is.na(Nationality) == FALSE)
  
# frequency of nationality
ggplot(moma_org5, aes(fct_reorder(Nationality, Frequency), Frequency)) +
  geom_bar(stat = "identity") +
  ggtitle("MOMA All Time Favorite Nationality", subtitle = "Top 5% Frequency of Nationality") +
  xlab("Nationality") +
  geom_col(color = mycolor, fill = myfill) +
  coord_flip() +
  theme(plot.title = element_text(face = "bold"))

theme_dotplot <- theme_bw(13) +
  theme(axis.text.y = element_text(size = rel(.75)),
  axis.ticks.y = element_blank(),
  axis.title.x = element_text(size = rel(.75)), 
  panel.grid.major.x = element_blank(), 
  panel.grid.major.y = element_line(size = 0.5), 
  panel.grid.minor.x = element_blank())

moma_org5 %>%
  ggplot(aes(Frequency, fct_reorder(Nationality, Frequency))) +
  geom_point(color = "red") +
  ggtitle("MOMA All Time Favorite Nationality", subtitle = "Top 5% Frequency of Nationality") +
  xlab("") +
  ylab("") +
  theme_dotplot

We observed that the number of American artists is nearly 5 times the number of artists from other countries (5472 counts), which means one-third of the artists featured by MOMA are from the U.S..

# if by top 10%, which is 165
moma_org10 <- moma_org %>% group_by (Frequency) %>% filter(Frequency > 165)
# filter out NA for now
moma_org10 <- moma_org10 %>% group_by (Nationality) %>% filter(is.na(Nationality)== FALSE)
# frequency of nationality
ggplot(moma_org10, aes(fct_reorder(Nationality, Frequency), Frequency)) +
  geom_bar(stat = "identity") +
  ggtitle("MOMA Artist Nationality by Frequency (top 10%)") +
  xlab("Nationality") +
  geom_col(color = mycolor, fill = myfill) +
  coord_flip() +
  theme(plot.title = element_text(face = "bold"))

moma_org10 %>%
  ggplot(aes(Frequency, fct_reorder(Nationality, Frequency))) +
  geom_point(color = "red") +
  ggtitle("MOMA All Time Favorite Nationality", subtitle = "Top 5% Frequency of Nationality") +
  xlab("") +
  ylab("") +
  theme_dotplot

Thus, we observed that among the 90% of the artworks collected by MOMA (which are 12 countries), 8 are European countries, 2 are North American countries, 1 is South American and 1 is Asian.

# Now, let's check by gender of those top 5%
moma_org5_gender <- moma_clean %>% group_by(Gender, Nationality) %>% dplyr::summarize(Frequency= dplyr::n()) %>% arrange(Frequency) %>% filter(Nationality %in% c("American", "British", "German", "Italian", "Japanese"))
#if exclude NA for now
moma_org5_gender <- moma_org5_gender %>% group_by(Gender) %>% filter(Gender %in% c("female", "male"))
ggplot(moma_org5_gender, aes(Nationality, Frequency, fill = Gender)) +
  geom_bar(stat = "identity") +
  ggtitle("MOMA All Time Favorite Nationality, Colored by Gender (top 5%)") +
  xlab("Nationality") +
  scale_fill_brewer(palette= "Oranges")+
  geom_col(color = mycolor) +
  coord_flip() +
  theme(plot.title = element_text(face = "bold"))

moma_org5_gender_mosaic <- moma_org5_gender
colnames(moma_org5_gender_mosaic)[which(names(moma_org5_gender_mosaic) == "Frequency")] <- "Freq"
vcd::mosaic(Gender ~ Nationality,
            direction = c("v", "h"),
            gp = gpar(fill=c("#E5F9E0", "#A3F7B5")),
            labeling = labeling_border(rot_labels = c(30, 0, 0, 90)),
            moma_org5_gender_mosaic)

# dplyr::ext, we look at Birth Year
moma_birthy <- moma_clean[4] %>% group_by (BeginDate) %>% dplyr::summarize(Frequency= dplyr::n()) %>% arrange(Frequency)
moma_birthy$BeginDate <- as.character(moma_birthy$BeginDate)
# again, check the stats
describe(moma_birthy)
## moma_birthy 
## 
##  2  Variables      234  Observations
## --------------------------------------------------------------------------------
## BeginDate 
##        n  missing distinct 
##      233        1      233 
## 
## lowest : 1730 1731 1746 1753 1765, highest: 2012 2014 2015 2016 2017
## --------------------------------------------------------------------------------
## Frequency 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      234        0      105    0.999    67.75    90.84     1.00     2.00 
##      .25      .50      .75      .90      .95 
##     6.00    22.50    94.25   138.40   153.00 
## 
## lowest :    1    2    3    4    5, highest:  177  184  193  198 3791
##                                               
## Value          0    50   100   150   200  3800
## Frequency    121    33    42    33     4     1
## Proportion 0.517 0.141 0.179 0.141 0.017 0.004
## 
## For the frequency table, variable is rounded to the nearest 50
## --------------------------------------------------------------------------------

Again, we take the 95% percentile, and see a similar result, that 95% of the birth years are fewer than 153 counts, which means most of the artists are from similar or identical birh years, which is quite surprising.

# let's see the top 5%
moma_birthy5 <- moma_birthy %>% group_by(Frequency) %>% filter(Frequency > 153)
# and exclude NA for now
moma_birthy5 <- moma_birthy5 %>% group_by(BeginDate) %>% filter(is.na(BeginDate) == FALSE)
# frequency of birth year
ggplot(moma_birthy5, aes(BeginDate, Frequency)) +
  geom_bar(stat = "identity") +
  ggtitle("MOMA Artist Birth Year by Frequency (top 5%)") +
  xlab("Birth Year") +
  geom_col(color = mycolor, fill = myfill) +
  # coord_flip() +
  theme(plot.title = element_text(face = "bold"))

We observed that these ten consequentive years (excluding 1945) accounts for the birth year of the artists that are most recognized by MOMA. Perhaps, these ten years really made artists creative :)

# let's see the top 10%, which is greater than 138
moma_birthy10 <- moma_birthy %>% group_by(Frequency) %>% filter(Frequency > 138)
# and exclude NA for now
moma_birthy10 <- moma_birthy10 %>% group_by(BeginDate) %>% filter(is.na(BeginDate) == FALSE)
# frequency of birth year
ggplot(moma_birthy10, aes(BeginDate, Frequency)) +
  geom_bar(stat = "identity") +
  ggtitle("MOMA Artist Birth Year by Frequency (top 10%)") +
  xlab("Birth Year") +
  geom_col(color = mycolor, fill = myfill) +
  # coord_flip() +
  theme_minimal() #+

  # theme(plot.title = element_text(face = "bold"))

Different from Nationality though, birth year is in fact a range of years with similar frequencies. For sake of simplicity, we take the years starting from (whatever years we decide)

# we plot by gender
moma_birthy5_gender <- moma_clean %>% 
  filter( !is.na(BeginDate)) %>%
  filter(Gender %in% c("female", "male")) %>%
  group_by(Gender, BeginDate) %>% 
  dplyr::summarize(Frequency=dplyr::n()) %>%
  ungroup %>%
  spread(Gender, Frequency)
# just check the top 5%
moma_birthy5_gender$BeginDate <- factor(moma_birthy5_gender$BeginDate, levels = moma_birthy5$BeginDate)

div(
  plot_ly(moma_birthy5_gender, x = ~male, y = ~fct_reorder(BeginDate, male+female), type = 'bar', orientation = 'h', name = 'male', marker = list(color = '#2678B2',
                      line = list(color = '#2678B2',
                                  width = 1))) %>%
    add_trace(x = ~female, name = 'female',
            marker = list(color = '#3ba9f7',
                          line = list(color = '#3ba9f7',
                                      width = 1))) %>%
  layout(barmode = 'stack',
         title = list(text = "MOMA Artist Birth Year by Gender (top 5%)", font = list(size = 16)),
         xaxis = list(title = list(text = "Count", font = list(size = 14)),
                      showgrid = TRUE,
                      showline = FALSE,
                      showticklabels = TRUE,
                      ticks = ''),
         yaxis = list(title = list(text = "Birth Year", font = list(size = 14)),
                      showgrid = FALSE,
                      showline = TRUE,
                      showticklabels = TRUE,
                      ticks = ''),
         legend = list(x = 0.8, 
                       y = 0, 
                       orientation = 'v')
  ), align = 'center')
# if plot on time series
moma_birthy_gen_time <- moma_clean %>% 
  filter(Gender %in% c("female", "male")) %>% 
  filter(BeginDate >= 1900) %>%
  group_by(Gender, BeginDate) %>% 
  dplyr::summarize(Frequency=dplyr::n()) %>%
  ungroup() %>%
  spread(Gender, Frequency)

div(
  plot_ly(moma_birthy_gen_time, x = ~BeginDate) %>%
  add_trace(y = ~female, name ='Female', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~male, name ='Male', type = 'scatter', mode = 'lines') %>%
  layout(title = list(text = "MOMA Artists Gender Comparison by Birth Year", font = list(size = 16)),
         xaxis = list(title = list(text = "Birth Year", font = list(size = 14)),
                      showline = FALSE,
                      showgrid = FALSE,
                      showticklabels = TRUE,
                      ticks = ''),
         yaxis = list(title = list(text = "Frequency", font = list(size = 14)),
                      gridcolor = 'lightgray',
                      showgrid = TRUE,
                      showline = FALSE,
                      showticklabels = TRUE,
                      ticks = '',
                      nticks = 7),
         legend = list(x = 0.7, 
                       y = 0.9, 
                       orientation = 'h')
  ), 
  align = 'center')
# calculate female-male ratio
moma_birthy_gen_ratio <- moma_clean %>%
  filter(Gender %in% c("female", "male")) %>%
  filter(BeginDate >= 1900 & BeginDate <= 1980) %>%
  group_by(BeginDate, Gender) %>% 
  dplyr::summarise(Frequency = dplyr::n()) %>%
  ungroup() %>%
  group_by(BeginDate) %>% 
  dplyr::mutate(Ratio = 100*Frequency[Gender=="female"]/Frequency[Gender == "male"]) %>%
  ungroup()

div(
  plot_ly(moma_birthy_gen_ratio, x = ~BeginDate, y = ~Ratio, type = 'scatter', mode = 'lines') %>%
  layout(title = list(text = "Timeseries Graph on Birth Year Female-Male Ratio (1900 - 1980)", font = list(size = 16)),
         xaxis = list(title = list(text = "Birth Year", font = list(size = 14)),
                      showline = FALSE,
                      showgrid = FALSE,
                      showticklabels = TRUE,
                      ticks = ''),
         yaxis = list(title = list(text = "Ratio (%)", font = list(size = 14)),
                      gridcolor = 'lightgray',
                      showgrid = TRUE,
                      showline = FALSE,
                      showticklabels = TRUE,
                      ticks = '',
                      nticks = 7)
  ), 
  align = 'center')

Gender Ratio Peaked at Year 1971 (62%)!

moma_artist_name <- moma_clean

# Capitalized for visualization
moma_artist_name$DisplayName <- capitalize(moma_artist_name$DisplayName)

They collected a lot from American artists. But who are they? Can we tell?

all_us <- moma_artist_name %>%
  filter(Nationality == 'American')
first_name_all_us <- as.data.frame(word(all_us$DisplayName, 1))
colnames(first_name_all_us) <- 'first_name'
first_name_all_us <- first_name_all_us %>%
  group_by(first_name) %>%
  dplyr::summarise(Frequency = dplyr::n()) %>%
  arrange(Frequency)

# first name frequency
div(
  plot_ly(tail(first_name_all_us, 10), x = ~Frequency, y = ~fct_reorder(first_name, Frequency), type = 'bar', orientation = 'h') %>%
  layout(title = list(text = "MOMA's All Time Favorite American First Names, by Frequency", font = list(size = 16)),
         xaxis = list(title = list(text = "Count", font = list(size = 14)),
                      showgrid = TRUE,
                      showline = FALSE,
                      showticklabels = TRUE,
                      ticks = ''),
         yaxis = list(title = list(text = "First Name", font = list(size = 14)),
                      showgrid = FALSE,
                      showline = TRUE,
                      showticklabels = TRUE,
                      ticks = '')
  ), align = 'center')
all_de <- moma_artist_name %>%
  filter(Nationality == 'German')
first_name_all_de <- as.data.frame(word(all_de$DisplayName, 1))
colnames(first_name_all_de) <- 'first_name'
first_name_all_de <- first_name_all_de %>%
  group_by(first_name) %>%
  dplyr::summarise(Frequency = dplyr::n()) %>%
  arrange(Frequency)
## Warning: Factor `first_name` contains implicit NA, consider using
## `forcats::fct_explicit_na`
# first name frequency
div(
  plot_ly(tail(first_name_all_de, 10), x = ~Frequency, y = ~fct_reorder(first_name, Frequency), type = 'bar', orientation = 'h') %>%
  layout(title = list(text = "MOMA's All Time Favorite German First Names, by Frequency", font = list(size = 16)),
         xaxis = list(title = list(text = "Count", font = list(size = 14)),
                      showgrid = TRUE,
                      showline = FALSE,
                      showticklabels = TRUE,
                      ticks = ''),
         yaxis = list(title = list(text = "First Name", font = list(size = 14)),
                      showgrid = FALSE,
                      showline = TRUE,
                      showticklabels = TRUE,
                      ticks = '')
  ), align = 'center')
all_fr <- moma_artist_name %>%
  filter(Nationality == 'French')
first_name_all_fr <- as.data.frame(word(all_fr$DisplayName, 1))
colnames(first_name_all_fr) <- 'first_name'
first_name_all_fr <- first_name_all_fr %>%
  group_by(first_name) %>%
  dplyr::summarise(Frequency = dplyr::n()) %>%
  arrange(Frequency)
## Warning: Factor `first_name` contains implicit NA, consider using
## `forcats::fct_explicit_na`
# first name frequency
div(
  plot_ly(tail(first_name_all_fr, 10), x = ~Frequency, y = ~fct_reorder(first_name, Frequency), type = 'bar', orientation = 'h') %>%
  layout(title = list(text = "MOMA's All Time Favorite French First Names, by Frequency", font = list(size = 16)),
         xaxis = list(title = list(text = "Count", font = list(size = 14)),
                      showgrid = TRUE,
                      showline = FALSE,
                      showticklabels = TRUE,
                      ticks = ''),
         yaxis = list(title = list(text = "First Name", font = list(size = 14)),
                      showgrid = FALSE,
                      showline = TRUE,
                      showticklabels = TRUE,
                      ticks = '')
  ), align = 'center')
all_uk <- moma_artist_name %>%
  filter(Nationality == 'British')
first_name_all_uk <- as.data.frame(word(all_uk$DisplayName, 1))
colnames(first_name_all_uk) <- 'first_name'
first_name_all_uk <- first_name_all_uk %>%
  group_by(first_name) %>%
  dplyr::summarise(Frequency = dplyr::n()) %>%
  arrange(Frequency)
## Warning: Factor `first_name` contains implicit NA, consider using
## `forcats::fct_explicit_na`
# first name frequency
div(
  plot_ly(tail(first_name_all_uk, 10), x = ~Frequency, y = ~fct_reorder(first_name, Frequency), type = 'bar', orientation = 'h') %>%
  layout(title = list(text = "MOMA's All Time Favorite British First Names, by Frequency", font = list(size = 16)),
         xaxis = list(title = list(text = "Count", font = list(size = 14)),
                      showgrid = TRUE,
                      showline = FALSE,
                      showticklabels = TRUE,
                      ticks = ''),
         yaxis = list(title = list(text = "First Name", font = list(size = 14)),
                      showgrid = FALSE,
                      showline = TRUE,
                      showticklabels = TRUE,
                      ticks = '')
  ), align = 'center')
all_jp <- moma_artist_name %>%
  filter(Nationality == 'Japanese')
first_name_all_jp <- as.data.frame(word(all_jp$DisplayName, 1))
colnames(first_name_all_jp) <- 'first_name'
first_name_all_jp <- first_name_all_jp %>%
  group_by(first_name) %>%
  dplyr::summarise(Frequency = dplyr::n()) %>%
  arrange(Frequency)
## Warning: Factor `first_name` contains implicit NA, consider using
## `forcats::fct_explicit_na`
# first name frequency
div(
  plot_ly(tail(first_name_all_jp, 10), x = ~Frequency, y = ~fct_reorder(first_name, Frequency), type = 'bar', orientation = 'h') %>%
  layout(title = list(text = "MOMA's All Time Favorite Japanese First Names, by Frequency", font = list(size = 16)),
         xaxis = list(title = list(text = "Count", font = list(size = 14)),
                      showgrid = TRUE,
                      showline = FALSE,
                      showticklabels = TRUE,
                      ticks = ''),
         yaxis = list(title = list(text = "First Name", font = list(size = 14)),
                      showgrid = FALSE,
                      showline = TRUE,
                      showticklabels = TRUE,
                      ticks = '')
  ), align = 'center')
# explore moma artworks
# to be clear though, for the sake of simplicity, we will only focus on artworks that made by single artists; so we drop all works made by multi artists

# Some pieces are done by more than one artist; check how many are there to decide what to do with them
artworks_single <- artworks_clean %>% 
  mutate(NumberArtists = lengths(strsplit(Artist, ","))) %>%
  mutate(NumberType = cut(NumberArtists, breaks = c(0,1,Inf), labels = c("Single", "Multiple"))) %>%
  filter(NumberType == 'Single')

str(artworks_single)
## Classes 'tbl_df', 'tbl' and 'data.frame':    130174 obs. of  12 variables:
##  $ Title         : chr  "Ferdinandsbrücke Project, Vienna, Austria (Elevation, preliminary version)" "City of Music, National Superior Conservatory of Music and Dance, Paris, France, View from interior courtyard" "Villa near Vienna Project, Outside Vienna, Austria, Elevation" "The Manhattan Transcripts Project, New York, New York, Introductory panel to Episode 1: The Park" ...
##  $ Artist        : chr  "Otto Wagner" "Christian de Portzamparc" "Emil Hoppe" "Bernard Tschumi" ...
##  $ Nationality   : chr  "Austrian" "French" "Austrian" NA ...
##  $ BeginDate     : chr  "1841" "1944" "1876" "1944" ...
##  $ EndDate       : chr  "1918" NA "1957" NA ...
##  $ Gender        : chr  "male" "male" "male" "male" ...
##  $ Date          : chr  "1896" "1987" "1903" "1980" ...
##  $ Medium        : chr  "Ink and cut-and-pasted painted pages on paper" "Paint and colored pencil on print" "Graphite, pen, color pencil, ink, and gouache on tracing paper" "Photographic reproduction with colored synthetic laminate" ...
##  $ Classification: chr  "Architecture" "Architecture" "Architecture" "Architecture" ...
##  $ YearAcquired  : chr  "1996" "1995" "1997" "1995" ...
##  $ NumberArtists : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ NumberType    : Factor w/ 2 levels "Single","Multiple": 1 1 1 1 1 1 1 1 1 1 ...

We have now 130174 observations.

artworks_org <- artworks_single[3] %>% 
  group_by(Nationality) %>% 
  dplyr::summarise(Frequency = dplyr::n()) %>%
  ungroup() %>%
  arrange(Frequency)
describe(artworks_org)
## artworks_org 
## 
##  2  Variables      143  Observations
## --------------------------------------------------------------------------------
## Nationality 
##        n  missing distinct 
##      142        1      142 
## 
## lowest :  American                     American American            American American  American  American French              American German            
## highest: Venezuelan                   Vietnamese                   Welsh                        Yugoslav                     Zimbabwean                  
## --------------------------------------------------------------------------------
## Frequency 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      143        0       76    0.991    910.3     1720      1.0      1.0 
##      .25      .50      .75      .90      .95 
##      2.0     13.0    130.5    804.8   2359.3 
## 
## lowest :     1     2     3     4     5, highest:  5585  7242  9270 22530 57234
## --------------------------------------------------------------------------------

Take last ten.

# exclude NA
artworks_org <- artworks_org %>%
  filter(!is.na(Nationality))

div(
  plot_ly(tail(artworks_org, 10), x = ~Frequency, y = ~fct_reorder(Nationality, Frequency), type = 'bar', orientation = 'h') %>%
  layout(title = list(text = "MOMA Artworks are Mostly Contributed by", font = list(size = 16)),
         xaxis = list(title = list(text = "Count", font = list(size = 14)),
                      showgrid = TRUE,
                      showline = FALSE,
                      showticklabels = TRUE,
                      ticks = ''),
         yaxis = list(title = list(text = "Nationality", font = list(size = 14)),
                      showgrid = FALSE,
                      showline = TRUE,
                      showticklabels = TRUE,
                      ticks = '')
  ), align = 'center')

So will some nationality contribute more artworks even though under represented? NO.

# lets also look at gender
artworks_gender <- artworks_single %>% 
  filter(Nationality %in% c("American", "French", "German","British", "Spanish", "Italian", "Japanese","Swiss","Russian", "Dutch")) %>%
  group_by(Gender, Nationality) %>% 
  dplyr::summarize(Frequency= dplyr::n()) %>% 
  ungroup() %>%
  arrange(Frequency)
#if exclude NA for now
artworks_gender_plot <- artworks_gender %>% 
  filter(Gender %in% c("female", "male")) %>%
  spread(Gender, Frequency)

div(
  plot_ly(artworks_gender_plot, x = ~male, y = ~fct_reorder(Nationality, male+female), type = 'bar', orientation = 'h', name = 'male', marker = list(color = '#2678B2',
                      line = list(color = '#2678B2',
                                  width = 1))) %>%
    add_trace(x = ~female, name = 'female',
            marker = list(color = '#3ba9f7',
                          line = list(color = '#3ba9f7',
                                      width = 1))) %>%
  layout(barmode = 'stack',
         title = list(text = "MOMA Artworks are Mostly Contributed by", font = list(size = 16)),
         xaxis = list(title = list(text = "Count", font = list(size = 14)),
                      showgrid = TRUE,
                      showline = FALSE,
                      showticklabels = TRUE,
                      ticks = ''),
         yaxis = list(title = list(text = "Nationality", font = list(size = 14)),
                      showgrid = FALSE,
                      showline = TRUE,
                      showticklabels = TRUE,
                      ticks = ''),
         legend = list(x = 0.8, 
                       y = 0, 
                       orientation = 'v')
  ), align = 'center')
artworks_gender_plot
## # A tibble: 10 x 3
##    Nationality female  male
##    <chr>        <int> <int>
##  1 American     11958 44646
##  2 British        474  4944
##  3 Dutch          194  1324
##  4 French         389 22040
##  5 German        1433  7718
##  6 Italian        232  2517
##  7 Japanese       344  2026
##  8 Russian        569  1329
##  9 Spanish         38  3050
## 10 Swiss          188  1798

So no.

aqr_freq <- artworks_single %>% 
  group_by(YearAcquired) %>% 
  dplyr::summarise(Frequency = dplyr::n()) %>%
  ungroup()
aqr_freq$YearAcquired <- strtoi(aqr_freq$YearAcquired)

div(
  plot_ly(aqr_freq, x = ~YearAcquired, y = ~Frequency, type = 'scatter', mode = 'lines') %>%
  layout(title = list(text = "Acquisition of French Artworks over Years", font = list(size = 16)),
         xaxis = list(title = list(text = "Acquisition Year", font = list(size = 14)),
                      showline = FALSE,
                      showgrid = FALSE,
                      showticklabels = TRUE,
                      ticks = ''),
         yaxis = list(title = list(text = "Frequency", font = list(size = 14)),
                      gridcolor = 'lightgray',
                      showgrid = TRUE,
                      showline = FALSE,
                      showticklabels = TRUE,
                      ticks = '',
                      nticks = 7)
  ), 
  align = 'center')

MOMA suddenly loved French Artworks around 1970.

# nationality frequency change over year
aqr_nationality <- artworks_single %>% 
  filter(Nationality %in% c("American", "French", "German","British", "Spanish", "Italian", "Japanese","Swiss","Russian", "Dutch")) %>%
  group_by(YearAcquired, Nationality) %>% 
  dplyr::summarise(Frequency = dplyr::n()) %>%
  ungroup() %>%
  spread(Nationality, Frequency)

aqr_nationality$YearAcquired <- strtoi(aqr_nationality$YearAcquired)

div(
  plot_ly(aqr_nationality, x = ~YearAcquired) %>%
  add_trace(y = ~American, name ='American', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~British, name ='British', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~Dutch, name ='Dutch', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~French, name ='French', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~German, name ='German', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~Italian, name ='Italian', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~Japanese, name ='Japanese', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~Russian, name ='Russian', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~Spanish, name ='Spanish', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~Swiss, name ='Swiss', type = 'scatter', mode = 'lines') %>%
  layout(title = list(text = "Acquisition Year Analysis on Nationality Frequency Change", font = list(size = 16)),
         xaxis = list(title = list(text = "Acquisition Year", font = list(size = 14)),
                      showline = FALSE,
                      showgrid = FALSE,
                      showticklabels = TRUE,
                      ticks = ''),
         yaxis = list(title = list(text = "Frequency", font = list(size = 14)),
                      gridcolor = 'lightgray',
                      showgrid = TRUE,
                      showline = FALSE,
                      showticklabels = TRUE,
                      ticks = '',
                      nticks = 7)
  ), 
  align = 'center')
# 100% stacked bar for collection of what year born
# mosaic -> female and male

aqr_gender <- artworks_single %>%
  filter(Gender %in% c("female", "male")) %>%
  group_by(YearAcquired, Gender) %>%
  dplyr::summarise(Frequency = dplyr::n()) %>%
  ungroup() %>%
  spread(Gender, Frequency)

aqr_gender$YearAcquired <- strtoi(aqr_gender$YearAcquired)

div(
  plot_ly(aqr_gender, x = ~YearAcquired) %>%
  add_trace(y = ~female, name ='Female', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~male, name ='Male', type = 'scatter', mode = 'lines') %>%
  layout(title = list(text = "Acquisition Year Analysis on Gender Frequency Change", font = list(size = 16)),
         xaxis = list(title = list(text = "Acquisition Year", font = list(size = 14)),
                      showline = FALSE,
                      showgrid = FALSE,
                      showticklabels = TRUE,
                      ticks = ''),
         yaxis = list(title = list(text = "Count", font = list(size = 14)),
                      gridcolor = 'lightgray',
                      showgrid = TRUE,
                      showline = FALSE,
                      showticklabels = TRUE,
                      ticks = '',
                      nticks = 7),
         legend = list(x = 0.75, 
                       y = 0.9, 
                       orientation = 'h')
  ), 
  align = 'center')

So the question becomes: who are those artists? A few of them? Or a bunch of them?

The years are 1968 and 1964.

# year around thos
peak_1970 <- artworks_single %>% group_by(YearAcquired, Nationality, Artist) %>% dplyr::summarise(Frequency = dplyr::n()) %>% arrange(Frequency)
peak_1970 <- peak_1970 %>% group_by(Nationality) %>% filter(Nationality == "French")
peak_1970 <- peak_1970 %>% group_by(YearAcquired) %>% filter(YearAcquired %in% c("1964", "1968"))
peak_1970 <- peak_1970 %>% group_by(Frequency) %>% filter(Frequency > 100)
peak_1970$YearAcquired <- strtoi(peak_1970$YearAcquired)

ggplot(peak_1970, aes(fct_reorder(Artist, Frequency), Frequency)) +
  geom_bar(position = "dodge", stat = "identity") +
  ggtitle("What happened in Year 1964 and 1968!?") +
  labs(x = "Frequency", y = "Artist") +
  geom_col(color = mycolor, fill = myfill) +
  facet_wrap(~YearAcquired) +
  theme_grey(14) +
  theme(plot.title = element_text(face = "bold")) +
  theme(plot.subtitle = element_text(face = "bold", color = "grey35")) +
  theme(plot.caption = element_text(color = "grey68"))

So we see that in year 1964, MOMA acquired artworks from different French artists, but in year 1968, they acquired mostly just from one artists, Eugène Atget.

# in addition, see those peak years for American artworks
us_artworks <- artworks_single %>% group_by(YearAcquired, Nationality, Artist) %>% dplyr::summarise(Frequency = dplyr::n()) %>% arrange(Frequency)
us_artworks <- us_artworks %>% group_by(Nationality) %>% filter(Nationality == "American")
us_artworks <- us_artworks %>% group_by(YearAcquired) %>% filter(YearAcquired %in% c("1974", "2008"))
us_artworks <- us_artworks %>% group_by(Frequency) %>% filter(Frequency > 100)
peak_1970$YearAcquired <- strtoi(peak_1970$YearAcquired)

ggplot(us_artworks, aes(fct_reorder(Artist, Frequency), Frequency)) +
  geom_bar(position = "dodge", stat = "identity") +
  ggtitle("What happened in Year 1964 and 1968!?") +
  labs(x = "Frequency", y = "Artist") +
  geom_col(color = mycolor, fill = myfill) +
  facet_wrap(~YearAcquired) +
  theme_grey(14) +
  theme(plot.title = element_text(face = "bold")) +
  theme(plot.subtitle = element_text(face = "bold", color = "grey35")) +
  theme(plot.caption = element_text(color = "grey68"))

# let's also see the overall artist number
artworks_artist <- artworks_single %>%
  filter(!is.na(Artist)) %>%
  group_by(Artist) %>%
  dplyr::summarise(Frequency = dplyr::n()) %>%
  arrange(Frequency)

div(
  plot_ly(tail(artworks_artist, 10), x = ~Frequency, y = ~fct_reorder(Artist, Frequency), type = 'bar', orientation = 'h') %>%
  layout(title = list(text = "Artists that Contributed Most Artworks to MOMA", font = list(size = 16)),
         xaxis = list(title = list(text = "Count", font = list(size = 14)),
                      showgrid = TRUE,
                      showline = FALSE,
                      showticklabels = TRUE,
                      ticks = ''),
         yaxis = list(title = list(text = "Artist Name", font = list(size = 14)),
                      showgrid = FALSE,
                      showline = TRUE,
                      showticklabels = TRUE,
                      ticks = '')
  ), align = 'center')
# see top classifications
class_freq <- artworks_single %>% 
  group_by(Classification) %>% 
  dplyr::summarise(Frequency = dplyr::n()) %>%
  ungroup %>%
  arrange(Frequency)

# see what classification they like and how they change over year on the top 10
artworks_class <- artworks_single %>%
  filter(Classification %in% c("Print", "Photograph", "Illustrated Book","Drawing","Design", "Architecture","Painting", "Video")) %>%
  group_by(YearAcquired, Classification) %>%
  dplyr::summarise(Frequency = dplyr::n()) %>%
  ungroup %>%
  arrange(Frequency) %>%
  spread(Classification, Frequency)
artworks_class$YearAcquired <- strtoi(artworks_class$YearAcquired)

div(
  plot_ly(artworks_class, x = ~YearAcquired) %>%
  add_trace(y = ~Print, name ='Print', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~Photograph, name ='Photograph', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~`Illustrated Book`, name ='Illustrated Book', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~Drawing, name ='Drawing', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~Design, name ='Design', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~Architecture, name ='Architecture', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~Painting, name ='Painting', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~Video, name ='Video', type = 'scatter', mode = 'lines') %>%
  layout(title = list(text = "Acquisition Year Analysis on Classification Frequency Change", font = list(size = 16)),
         xaxis = list(title = list(text = "Acquisition Year", font = list(size = 14)),
                      showline = FALSE,
                      showgrid = FALSE,
                      showticklabels = TRUE,
                      ticks = ''),
         yaxis = list(title = list(text = "Count", font = list(size = 14)),
                      gridcolor = 'lightgray',
                      showgrid = TRUE,
                      showline = FALSE,
                      showticklabels = TRUE,
                      ticks = '',
                      nticks = 7)
  ), 
  align = 'center')
artworks_med <- artworks_single %>% 
  filter(Medium %in% c("Gelatin silver print", "Lithograph", "Albumen silver print","Pencil on paper","Letterpress", "Etching","Chromogenic color print", "Lithograph, printed in color")) %>%
  group_by(YearAcquired, Medium) %>% 
  dplyr::summarise(Frequency = dplyr::n()) %>%
  ungroup() %>%
  spread(Medium, Frequency)
artworks_med$YearAcquired <- strtoi(artworks_med$YearAcquired)

div(
  plot_ly(artworks_med, x = ~YearAcquired) %>%
  add_trace(y = ~`Gelatin silver print`, name ='Gelatin silver print', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~Lithograph, name ='Lithograph', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~`Albumen silver print`, name ='Albumen silver print', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~`Pencil on paper`, name ='Pencil on paper', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~Letterpress, name ='Letterpress', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~Etching, name ='Etching', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~`Chromogenic color print`, name ='Chromogenic color print', type = 'scatter', mode = 'lines') %>%
  add_trace(y = ~`Lithograph, printed in color`, name ='Lithograph, printed in color', type = 'scatter', mode = 'lines') %>%
  layout(title = list(text = "Acquisition Year Analysis on Medium Frequency Change", font = list(size = 16)),
         xaxis = list(title = list(text = "Acquisition Year", font = list(size = 14)),
                      showline = FALSE,
                      showgrid = FALSE,
                      showticklabels = TRUE,
                      ticks = ''),
         yaxis = list(title = list(text = "Count", font = list(size = 14)),
                      gridcolor = 'lightgray',
                      showgrid = TRUE,
                      showline = FALSE,
                      showticklabels = TRUE,
                      ticks = '',
                      nticks = 7)
  ), 
  align = 'center')

Gelatin Silver Print can be their New Favorite!

aqr_birth <- artworks_single %>% group_by(YearAcquired,BeginDate) %>% dplyr::summarise(Frequency = dplyr::n()) %>% arrange(Frequency)
aqr_birth$YearAcquired <- strtoi(aqr_birth$YearAcquired)

ggplot(aqr_birth, aes(YearAcquired, Frequency, color = BeginDate)) +
  geom_line() + 
  ggtitle("Gelatin Silver Print can be their New Favorite!", subtitle= "Acquisition Year Analysis on Gender Frequency Change") +
  labs(x = "Year", y = "Frequency") +
  theme_grey(14) +
  theme(plot.title = element_text(face = "bold")) +
  theme(plot.subtitle = element_text(face = "bold", color = "grey35")) +
  theme(plot.caption = element_text(color = "grey68"))
## Warning: Removed 123 rows containing missing values (geom_path).

# trying to see if there is pattern of acq of artist birth year
artworks_mos <- artworks_single[c(3,4,6,9,10)]
artworks_mos <- artworks_mos %>% mutate(BirthIdx = (BeginDate < 1950))
artworks_mos$BirthIdx[(artworks_mos$BirthIdx)== TRUE] <- 1
artworks_mos$BirthIdx[(artworks_mos$BirthIdx)== FALSE] <- 3
artworks_mos <- artworks_mos %>% mutate(BirthType = cut(BirthIdx, breaks = c(0,1,Inf), labels = c("First Half", "Second Half")))
artworks_mos <- artworks_mos %>% group_by(Gender, BirthType) %>% dplyr::summarise(Frequency = dplyr::n())
## Warning: Factor `BirthType` contains implicit NA, consider using
## `forcats::fct_explicit_na`
vcd::mosaic(BirthType ~ Gender, artworks_mos)